################################################################################ 
#
# The distribution of hate speech and its implications for content moderation
# PSRM - Replication package
# Table E11
#
################################################################################ 


################################################################################ 
#  LIBRARIES
################################################################################ 

library(hdm)
library(dplyr)
library(readr)
library(gtools)

# Clear the current workspace, keeping only objects whose names start with "wd"
# or are exactly "setsave" (these are expected to be defined before this script
# runs; e.g. `wd_data_processed` and `wd_res` are used below).
rm(list = setdiff(ls(), ls(pattern = "^wd|^setsave$")))

################################################################################ 
#   DATA AND FOLDER
################################################################################ 

# Load the analysis data set and the candidate controls for the lasso
data = read.csv(
  paste0(wd_data_processed, '/for_analysis/clean_for_analysis.csv')
)
lassodata = read_rds(
  paste0(wd_data_processed, '/for_analysis/data_lasso.RDS')
)

# Keep only users for whom all four exclusion flags equal 0. which() turns the
# condition into row positions, so rows where it evaluates to NA are dropped.
data = data[
  which(
    data$attrition == 0 &
      data$not_auth_sofar == 0 &
      data$suspended_sofar == 0 &
      data$protected_sofar == 0
  ),
]


################################################################################ 
#   VARIABLES as in the PAP
################################################################################

# Outcome columns and their display labels; `outvars_name` is matched to
# `outvars` by position.
outvars = c(
  'anchor_deleted_12h',
  'no_hate_tweets_post_85',
  'hate_rate_85',
  'post_tweets_h_avg'
)
outvars_name = c(
  'Hate Tweet Deleted <12h',
  '# Hate Tweets',
  'Share of daily hate',
  'Probability of Hate Speech'
)

# Treatment arms and their display labels, again matched by position.
treatments = c('alert', 'consequences', 'empathy')
treatments_lab = c(
  'Alerting of Hate Speech',
  'Warning of Consequences',
  'Empathy'
)



################################################################################ 
#   Scale variables
################################################################################ 

# Standardise every outcome column in place with scale() (default arguments).
data[outvars] = lapply(data[outvars], scale)

# Drop the first column of the lasso covariates, then standardise the remaining
# columns and store them as a data frame.
lassodata = lassodata[, 2:ncol(lassodata)]
lassodata = as.data.frame(lapply(lassodata, scale))


################################################################################ 
#   Functions
################################################################################ 

# Regression model
#
# For every outcome/treatment pair, fits hdm::rlassoEffect() with
# method = "double selection" on the subsample of users whose `group` value is
# in `group_ind`, comparing the given treatment arm with the 'control' rows.
#
# Arguments:
#   outvars     character vector of outcome column names in `data`.
#   treatments  character vector of arm names, matched against
#               data$treat_aggregate.
#   group_ind   values of data$group that define the subsample.
#
# Returns:
#   A list with one fitted model per outcome/treatment pair. Treatments vary
#   fastest: element (j - 1) * length(treatments) + i holds outcome j and
#   treatment i. Empty `outvars` or `treatments` give an empty list.
#
# Reads the globals `data` (needs columns `group`, `treat_aggregate`, `treated`
# and the outcomes) and `lassodata` (candidate controls). The two are matched
# purely by row position.
regresssion = function(outvars, treatments, group_ind){

  # lassodata is subset with row positions computed on data, so a differing
  # number of rows means the covariates cannot belong to the selected users.
  # NOTE(review): equal row counts do not prove identical row order - confirm
  # that data_lasso was built from the same (filtered) rows as `data`.
  if (nrow(lassodata) != nrow(data)) {
    stop("`lassodata` has ", nrow(lassodata), " rows but `data` has ",
         nrow(data), "; they are matched by row position and must align",
         call. = FALSE)
  }

  in_group = which(data$group %in% group_ind)
  data_temp = data[in_group, ]
  lassodata_temp = lassodata[in_group, ]

  print(paste0("subset length is: ", nrow(data_temp)))

  n_treat = length(treatments)
  MODELS_controls = vector("list", length(outvars) * n_treat)

  # Control rows are the same for every outcome/treatment pair
  control_rows = which(data_temp$treat_aggregate == 'control')

  for (j in seq_along(outvars)){

    # Rows whose outcome is NA, NaN, Inf or -Inf cannot enter the regression
    droper = which(!is.finite(data_temp[[outvars[j]]]))

    for (i in seq_along(treatments)){

      n = (j - 1) * n_treat + i

      # Treated rows of this arm first, then controls, minus unusable outcomes
      keeper = union(which(data_temp$treat_aggregate == treatments[i]), control_rows)
      keeper = setdiff(keeper, droper)

      y = data_temp[keeper, outvars[j]]
      d = data_temp$treated[keeper]
      x = as.matrix(lassodata_temp[keeper, ])

      print('Data ready')

      # Passed as I3: covariates in columns 2 and 3 of x are flagged TRUE so
      # that they are kept in the model rather than left to the lasso selection
      always_include = rep(FALSE, ncol(x))
      always_include[c(2, 3)] = TRUE

      MODELS_controls[[n]] = rlassoEffect(x, y, d, method = "double selection",
                                          I3 = always_include)

      print('Done with: ')
      print(n)
    }
  }
  return(MODELS_controls)
}


# Outcome cleaning
#
# Collects the results of a list of fitted models (as returned by regresssion())
# into a data frame with one row per model.
#
# Arguments:
#   list_to_be_transformed  list of fitted models, ordered with treatments
#                           varying fastest within each outcome.
#   outvars                 character vector of outcome names.
#   outvars_name            display labels, same length and order as `outvars`.
#
# Returns a data frame with columns yvar, yvar_name, treat, treatments_lab,
# coef, se, pval, samplesize, lasso_cont_num and controls.
#
# Reads the globals `treatments` and `treatments_lab`; the list must contain
# length(outvars) * length(treatments) models.
make_lasso_container = function(list_to_be_transformed, outvars, outvars_name){
  MOD = list_to_be_transformed

  # Fail loudly instead of letting labels be recycled against the wrong models
  if (length(outvars_name) != length(outvars)) {
    stop("`outvars_name` must have one label per entry of `outvars`", call. = FALSE)
  }
  if (length(treatments_lab) != length(treatments)) {
    stop("`treatments_lab` must have one label per entry of `treatments`", call. = FALSE)
  }
  if (length(MOD) != length(outvars) * length(treatments)) {
    stop("expected ", length(outvars) * length(treatments),
         " models (outcomes x treatments) but got ", length(MOD), call. = FALSE)
  }

  # summary() is evaluated once per model; its first element is read as the
  # coefficient table, with entries 1, 2 and 4 taken as estimate, standard
  # error and p-value (assumes the layout of hdm's summary - TODO confirm).
  coef_tables = lapply(MOD, function(m) summary(m)[[1]])
  coef = vapply(coef_tables, function(tab) tab[1], numeric(1))
  se = vapply(coef_tables, function(tab) tab[2], numeric(1))
  pval = vapply(coef_tables, function(tab) tab[4], numeric(1))

  # The first two entries of coefficients.reg are excluded from both the count
  # and the names (presumably intercept and treatment - verify against hdm).
  regnum = vapply(MOD, function(m) length(m$coefficients.reg) - 2, numeric(1))
  sample = vapply(MOD, function(m) m$samplesize, numeric(1))
  controls = vapply(MOD,
                    function(m) paste0(names(m$coefficients.reg)[-c(1, 2)], collapse = ', '),
                    character(1))

  container = cbind.data.frame(yvar = rep(outvars, each = length(treatments)),
                               yvar_name = rep(outvars_name, each = length(treatments)),
                               treat = rep(treatments, times = length(outvars)),
                               treatments_lab = rep(treatments_lab, times = length(outvars)),
                               coef = coef, se = se, pval = pval,
                               samplesize = sample,
                               lasso_cont_num = regnum,
                               controls = controls)
  return(container)
}



################################################################################
# Heterogeneity over median past hate
################################################################################

# Adjust variable
data$hate_pre = data$no_hate_tweets_pre   # both variables refer to the 30 days before intervention
data$hate_pre[data$no_tweets_pre==0] = 0  # 3 users did not post anything

# Create group indicator: quantcut() with q = 2 cuts hate_pre into two quantile
# groups, labelled 1 and 2
groups = c(1, 2)
data$group = quantcut(data$hate_pre, q = 2, na.rm = TRUE, labels=groups)

# Run regressions separately within each group
MODELS1 = regresssion(outvars, treatments, c(1))
MODELS2 = regresssion(outvars, treatments, c(2))

# Clean results: one row per outcome/treatment pair, tagged with its subset
container1 = make_lasso_container(MODELS1, outvars, outvars_name)
container1$subset = "1/2 quantiles of Hate"

container2 = make_lasso_container(MODELS2, outvars, outvars_name)
container2$subset = "2/2 quantiles of Hate"

# Stack both subsets into the final table
container = rbind(container1,container2)

# Save (FALSE spelled out: `F` is an ordinary variable that can be reassigned)
write.csv(container, paste0(wd_res, '/tables/tabE11.csv'), row.names = FALSE)

cat("\n====================\n")
cat("Saved Table E11")
cat("\n====================\n")
